package com.example.demo.util.craw;

import lombok.Getter;
import lombok.Setter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Description: <br/>
 *
 * @author liulonglong<br                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               />;
 * @taskId <br/>
 * @Date 2019/5/23 13:52
 * @Version 1.0
 */
@Setter
@Getter
public class SimpleCrawlJob extends AbstractJob {

    /**
     * 配置项信息
     */
    private CrawlMeta crawlMeta;

    /**
     * 爬取的结果
     */
    private CrawlResult crawlResult;

    /**
     * 实现逻辑
     *
     * @throws Exception
     */
    @Override public void doFetchPage() throws Exception {
        URL url = new URL(crawlMeta.getUrl());
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        BufferedReader in = null;
        StringBuilder result = new StringBuilder();
        try {
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // 遍历响应头字段
            for (String s : connection.getHeaderFields().keySet()) {
                System.out.println(s + "--->" + connection.getHeaderFields().get(s));
            }
            in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        parse(result.toString());
    }

    private void parse(String html) {

        Document document = Jsoup.parse(html);
        Map<String, List<String>> map = new HashMap<>(crawlMeta.getSelectorRules().size());
        for (String rule : crawlMeta.getSelectorRules()) {
            List<String> list = new ArrayList<>();
            for (Element element : document.select(rule)) {
                list.add(element.text());
            }
            map.put(rule, list);
        }
        this.crawlResult = new CrawlResult();
        this.crawlResult.setHtmlDoc(document);
        this.crawlResult.setUrl(crawlMeta.getUrl());
        this.crawlResult.setResult(map);
    }
}
